# This file contains the code to create EUETS_innovation_public.Rdata, a collection of data sets used in the analysis reported in sections 3, 4, 5, and appendices D and E of the article ``Environmental Policy and Directed Technological Change: Evidence from the European carbon market''.

############### Preamble ###############
require(foreign)
require(Matching)
rm(list = ls())
master <- read.dta("ETS_innov_matching_data.dta")

############### Pre-process data for matching ############### 
# Split the master data set into country data sets to reduce the computational burden of matching. For matching EU ETS firms to firms in non-ETS countries, create data sets containing only EU ETS firms and allowable matches.
data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "PL", "PT", "SE", "SK", "nonETS", "US")
data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "PL", "PT", "SE", "SK", )

for (i in seq_along(data)) {
	name <- data[i]
	if (name=="nonETS") { ds <- master[master$ETS==1 | master$country=="BG" | master$country=="CH" | master$country=="NO" | master$country=="RO",] }
	if (name=="US") { ds <- master[master$ETS==1 | master$country=="US",] }
	if (name!="nonETS" & name!="US") { ds <- master[master$country==data[i],] }
	## Exclude ETS-related controls
	ds <- ds[ds$ETS==1 | (ds$ETS==0 & ds$ETS_related==0),] 
	## Exclude firms in sectors (3-digit NACE Rev. 2) with no treated firms
	ds$nace3 <- as.numeric(substr(ds$nace, 1, 3))
	sector <- unique(ds$nace3[ds$ETS==1])
	sector_count <- matrix(nrow=0,ncol=3)
	colnames(sector_count) <- c("nace3","t","c")
	for (j in 1:length(sector)) {
		sector_count <- rbind(sector_count, cbind(sector[j], sum(ds$ETS[ds$nace3==sector[j]]), nrow(ds[ds$ETS==0 & ds$nace3==sector[j],])))
	}
	sector_count <- as.data.frame(sector_count)
	sector <- subset(sector, sector_count$t>0 & sector_count$c>0)
	ds <- subset(ds, ds$nace3 %in% sector)
	rm("sector_count","sector")
	rownames(ds) <- 1:nrow(ds)
	name <- paste(data[i], "_d", sep="")
	assign(name, ds)
}
rm("data", "ds", "i", "j", "master", "name")
save.image(file="matching_data.RData")

# Split data set into country data sets for co-patenting analysis. Note that co-patenters may be present in any one of the countries in our data set, not just in countries that launched the EU ETS in 2005.
data <- c("AT", "BE", "BG", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "NO", "PL", "PT", "RO", "SE", "SK", "US")
for (i in seq_along(data)) {
	name <- data[i]
	ds <- master[master$country==data[i],]
	## Exclude ETS-related controls
	ds <- ds[ds$copatent==1 | (ds$copatent==0 & ds$ETS_related==0),]
	## Exclude firms in sectors (3-digit NACE Rev. 2) with no treated firms
	ds$nace3 <- as.numeric(substr(ds$nace, 1, 3))
	sector <- unique(ds$nace3[ds$copatent==1])
	if (length(sector)==0) { next }
	sector_count <- matrix(nrow=0,ncol=3)
	colnames(sector_count) <- c("nace3","t","c")
	for (j in 1:length(sector)) {
		sector_count <- rbind(sector_count, cbind(sector[j], sum(ds$copatent[ds$nace3==sector[j]]), nrow(ds[ds$copatent==0 & ds$nace3==sector[j],])))
	}
	sector_count <- as.data.frame(sector_count)
	sector <- subset(sector, sector_count$t>0 & sector_count$c>0)
	ds <- subset(ds, ds$nace3 %in% sector)
	rm("sector_count","sector")
	rownames(ds) <- 1:nrow(ds)
	name <- paste(data[i], "_d", sep="")
	assign(name, ds)
}
rm("data", "ds", "i", "j", "master", "name")
save.image(file="matching_data_copatent.RData")


############### Matching ###############
# The article reports results from 6 matched data sets: the main specification discussed in subsections 4.1. and 4.2., 4 alternative matching specifications discussed in section 4.3. and appendix E, and a matched set of co-patenters discussed in section 5. The code below constructs these 6 matched data sets from the pre-processed data.
# Matched controls are always replaced. All 'ties' are kept and averaged.
# Matching relies on GenMatch, a stochastic algorithm that will not necessarily replicate the matched output exactly each time the code is run. We provide the matched output from our run in EUETS_innovation_public.Rdata, stripped of any data held under license.

# Main specification: 
## (1) EU ETS firms matched to non-EU ETS firms in the same country and sector. Treatment year 2005, Standard EPO definition, Using average revenues from 2001-2004 as a covariate.

# Alternative specifications:
## (2) As main specification, except that average revenues are measured up to and including 2005.
## (3) As main specification, except that the treatment year is assumed to be 2003, so that all covariates are measured up to and including 2002 only.
## (4) As main specification, except that non-EU ETS firms are based in BG, CH, NO, RO.
## (5) As main specification, except that non-EU ETS firms are based in US.
## (6) As main specification, except that EU ETS firms are substituted for their co-patenters and non-EU ETS firms are substituted for firms not holding a joint patent with an EU ETS firm.

rm(list = ls())
load(file="matching_data.RData")

for (i in 1:5) {	 
	if (i<3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "PL", "PT", "SE", "SK") }
	if (i==3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "NL", "PL", "PT", "SE", "SK") } # LU does not have any EU ETS firms with sufficient data to match under the matching criteria set for specification 3, so it is excluded.
	if (i==4) { data <- "nonETS" }
	if (i==5) { data <- "US" }
	
	for (j in seq_along(data)) {
		# Read in data
		ds <- get(paste(data[j], "_d", sep=""))
		
		# Exclude firms where revenue data is insufficient to match
		if (i<2 | i>3) { ds <- ds[ds$revenue0104!=-3,] } # Exclude firms without pre-2005 revenue data
		if (i==2) { ds <- ds[ds$revenue0105!=-3,] } # Exclude firms without pre-2006 revenue data
		if (i==3) { ds <- ds[ds$revenue0102!=-3,] } # Exclude firms without pre-2003 revenue data
		
		# Create additional matching variables
		if (i!=3) {
			ds$patents <- 0
			ds$patents[ds$epo_total_pat7889>0 | ds$epo_total_pat9099>0 | ds$epo_total_pat0002>0 | ds$epo_total_pat0304>0] <- 1 # A binary variable indicating whether or not the firm has ever filed a patent at the EPO.
			ds$green_pat <- 0
			ds$green_pat[ds$epo_green_pat_epo7889>0 | ds$epo_green_pat_epo9099>0 | ds$epo_green_pat_epo0002>0 | ds$epo_green_pat_epo0304>0] <- 1 # A binary variable indicating whether or not the firm has ever filed a low-carbon patent at the EPO.
			ds$patent_count <- ds$epo_total_pat0002 + ds$epo_total_pat0304 # The number of patents filed in 2000-2004.
			ds$green_patent_count <- ds$epo_green_pat_epo0002 + ds$epo_green_pat_epo0304 # The number of low-carbon patents filed in 2000-2004.
		}
		if (i==3) { # Patent variables are measured up to and including 2002 only.
			ds$patents <- 0
			ds$patents[ds$epo_total_pat7889>0 | ds$epo_total_pat9099>0 | ds$epo_total_pat0002>0] <- 1
			ds$green_pat <- 0
			ds$green_pat[ds$epo_green_pat_epo7889>0 | ds$epo_green_pat_epo9099>0 | ds$epo_green_pat_epo0002>0] <- 1
			ds$patent_count <- ds$epo_total_pat0002
			ds$green_patent_count <- ds$epo_green_pat_epo0002
		}
		
		# Match
		zs <- ds$ETS
		Y <- ds$epo_green_pat_epo0507 # Match() requires an outcome variable as input to calculate default estimates of treatment effects. This variable is not used to construct matches, so an artibrary variable is given here.
		## The balance matrix includes some interaction terms and higher-order terms in order to better balance the joint distribution of covariates.
		if (i<2 | i>3) {
			Xs <- subset(ds, select = c(nace3,yearofincorporation,revenue0104,patents,green_pat,patent_count,green_patent_count))
			balance_matrix <- as.matrix(cbind(Xs, I(Xs$revenue0104^2), I(Xs$yearofincorporation*Xs$patents), I(Xs$yearofincorporation*Xs$patent_count)))
		}
		if (i==2) {
			Xs <- subset(ds, select = c(nace3,yearofincorporation,revenue0105,patents,green_pat,patent_count,green_patent_count))
			balance_matrix <- as.matrix(cbind(Xs, I(Xs$revenue0105), I(Xs$yearofincorporation*Xs$patents), I(Xs$yearofincorporation*Xs$patent_count)))
		}
		if (i==3) {
			Xs <- subset(ds, select = c(nace3,yearofincorporation,revenue0102,patents,green_pat,patent_count,green_patent_count))
			balance_matrix <- as.matrix(cbind(Xs, I(Xs$revenue0102^2), I(Xs$yearofincorporation*Xs$patents), I(Xs$yearofincorporation*Xs$patent_count)))
		}
		Xs <- as.matrix(Xs)
		gen <- GenMatch(Tr=zs, X=Xs, BalanceMatrix=balance_matrix, pop.size=500, caliper=c(100,100,0.1,100,100,0.1,0.1))
		exact <- rep(F, length(colnames(Xs)))
		exact[grep("patents", colnames(Xs))] <- T
		exact[grep("nace3", colnames(Xs))] <- T
		m <- Match(Y=Y, Tr=zs, X=Xs, Weight.matrix=gen, exact = exact)
		
		# Correct the index numbers to take account of smaller data set, so they can be easily retreived from the larger data set later
		m$index.treated <- as.numeric(rownames(ds[m$index.treated,]))
		m$index.control <- as.numeric(rownames(ds[m$index.control,]))
		m$index.dropped <- as.numeric(rownames(ds[m$index.dropped,]))
		rm("balance_matrix","exact","Xs","zs","Y")
	
		# Save matching output
		name1 <- paste(data[j], "_gen", i, sep="")
		assign(name1, gen)
		name2 <- paste(data[j], "_m", i, sep="")
		assign(name2, m)
		rm("m", "gen")
		filename <- paste("matched_data_", data[j], "_v", i, ".RData", sep="")
		save(list=c(name1, name2), file=filename) 
		rm(list=c("filename", "name1", "name2"))
	}
}

# Match co-patenters (specification 6, see above):
rm(list=ls())
load(file="matching_data_copatent.RData")

data <- c("AT", "BE", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "LT", "LU", "NL", "NO", "PT", "RO", "SE", "US") # Co-patenters may be located in any of the countries in our data set, not just in countries that launched the EU ETS in 2005. However, there are no co-patenters with sufficient data to match in BG, IE, PL, or SK, so these countries can be excluded here to improve computational efficiency without any change to the result.

for (j in seq_along(data)) {
	# Read in data
	ds <- get(paste(data[j], "_d", sep=""))

	# Exclude firms where revenue data is insufficient to match
	ds <- ds[ds$revenue0104!=-3,]

	# Create additional matching variables
	ds$patents <- 0
	ds$patents[ds$epo_total_pat7889>0 | ds$epo_total_pat9099>0 | ds$epo_total_pat0002>0 | ds$epo_total_pat0304>0] <- 1
	ds$green_pat <- 0
	ds$green_pat[ds$epo_green_pat_epo7889>0 | ds$epo_green_pat_epo9099>0 | ds$epo_green_pat_epo0002>0 | ds$epo_green_pat_epo0304>0] <- 1
	ds$patent_count <- ds$epo_total_pat0002 + ds$epo_total_pat0304
	ds$green_patent_count <- ds$epo_green_pat_epo0002 + ds$epo_green_pat_epo0304
	
	# Match
	zs <- ds$copatent
	Y <- ds$epo_green_pat_epo0507
	## The balance matrix includes some interaction and higher-order terms, to better balance the joint distribution of covariates.
	Xs <- subset(ds, select = c(nace3,yearofincorporation,revenue0104,patents,green_pat,patent_count,green_patent_count))
	balance_matrix <- as.matrix(cbind(Xs, I(Xs$revenue0104^2), I(Xs$yearofincorporation*Xs$patents), I(Xs$yearofincorporation*Xs$patent_count)))
	Xs <- as.matrix(Xs)
	gen <- GenMatch(Tr=zs, X=Xs, BalanceMatrix=balance_matrix, pop.size=500, caliper=c(100,100,0.1,100,100,0.1,0.1))
	exact <- rep(F, length(colnames(Xs)))
	exact[grep("patents", colnames(Xs))] <- T
	exact[grep("nace3", colnames(Xs))] <- T
	m <- Match(Y=Y, Tr=zs, X=Xs, Weight.matrix=gen, exact = exact)
	
	# Correct the index numbers to take account of smaller data set, so they can be easily retreived from the larger data set
	m$index.treated <- as.numeric(rownames(ds[m$index.treated,]))
	m$index.control <- as.numeric(rownames(ds[m$index.control,]))
	m$index.dropped <- as.numeric(rownames(ds[m$index.dropped,]))
	rm("balance_matrix","exact","Xs","zs","Y")

	# Save matching output
	name1 <- paste(data[j], "_gen1", sep="")
	assign(name1, gen)
	name2 <- paste(data[j], "_m1", sep="")
	assign(name2, m)
	rm("m", "gen")
	filename <- paste("matched_data_copatent_", data[j], "_v6.RData", sep="")
	save(list=c(name1, name2), file=filename) 
	rm(list=c("filename", "name1", "name2"))
}


############### Compile matched data sets ###############
# Matching above returns the index numbers of matched observations, which are now used to construct the matched data sets.
## Retrieve matched observations for specifications 1 through 5:
rm(list=ls())
for (i in 1:5) {
	if (i<3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "PL", "PT", "SE", "SK") }
	if (i==3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "NL", "PL", "PT", "SE", "SK") } # LU does not have any EU ETS firms with sufficient data to match under the matching criteria set for specification 3, so it is excluded.
	if (i==4) { data <- "nonETS" }
	if (i==5) { data <- "US" }
	for (j in seq_along(data)) {
		filename <- paste("matched_data_", data[j], "_v", i,".RData", sep="")
		load(filename)
		rm("filename")
	}
}
rm(i, j, data)
load(file="matching_data.RData")

for (i in 1:5) {
	if (i<3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "LU", "NL", "PL", "PT", "SE", "SK") }
	if (i==3) { data <- c("AT", "BE", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "IE", "LT", "NL", "PL", "PT", "SE", "SK") } # LU does not have any EU ETS firms with sufficient data to match under the matching criteria set for specification 3, so it is excluded.
	if (i==4) { data <- "nonETS" }
	if (i==5) { data <- "US" }
	treated <- data.frame()
	control <- data.frame()
	for (j in seq_along(data)) {
		country_d <- get(paste(data[j], "_d", sep=""))
		country_m <- get(paste(data[j], "_m", i, sep=""))
		treated <- rbind(treated, country_d[floor(country_m$index.treated),])
		control <- rbind(control, country_d[floor(country_m$index.control),])
	}
	name1 <- paste("treated_v", i, sep="")
	assign(name1, treated)
	name2 <- paste("control_v", i, sep="")
	assign(name2, control)
	rm(treated, control, country_d, country_m)
}

## Add number of inventor locations in 2000-2004 to v1 (this variable was added later at a reviewer's suggestion, and is analysed in appendix E)
x <- read.dta("BVDID_inventor_locations_00_04_oldstata.dta")
names(x) <- c("bvdidnumber", "inv_loc")
x <- x[x$bvdidnumber %in% (unique(rbind(treated_v1$bvdidnumber, control_v1$bvdidnumber))),]
treated_v1 <- merge(treated_v1, x, by="bvdidnumber", all.x=T)
treated_v1$inv_loc[is.na(treated_v1$inv_loc)] <- 0 # If no innovation locations are recorded, set to zero as default...
treated_v1$inv_loc[treated_v1$inv_loc==0 & treated_v1$epo_green_pat_epo0002+treated_v1$epo_green_pat_epo0304>0] <- 1 # ... or to one, whenever the firm had filed at least one low-carbon patent.
control_v1$bvdidnumber_treated <- treated_v1$bvdidnumber
control_v1 <- merge(control_v1, x, by="bvdidnumber", all.x=T)
control_v1 <- control_v1[order(control_v1$bvdidnumber_treated),]
control_v1 <- control_v1[,-grep("bvdidnumber_treated", names(control_v1))]
control_v1$inv_loc[is.na(control_v1$inv_loc)] <- 0
control_v1$inv_loc[control_v1$inv_loc==0 & control_v1$epo_green_pat_epo0002+control_v1$epo_green_pat_epo0304>0] <- 1
rm(x)

## Add average revenue growth in 2000-2004 to v1 (this variable was added later at a reviewer's suggestion, and is analysed in appendix E)
x <- read.dta("full_matched_sample_turnover.dta")
names(x) <- c("bvdidnumber", "ets", "avg_growth", "avg_growth_available")
x <- subset(x, select=c("bvdidnumber", "avg_growth_available"))
names(x) <- c("bvdidnumber", "avg_growth")
treated_v1 <- merge(treated_v1, x, by="bvdidnumber", all.x=T)
control_v1$bvdidnumber_treated <- treated_v1$bvdidnumber
control_v1 <- merge(control_v1, x, by="bvdidnumber", all.x=T)
control_v1 <- control_v1[order(control_v1$bvdidnumber_treated),]
control_v1 <- control_v1[,-grep("bvdidnumber_treated", names(control_v1))]
rm(x)

## Average 'ties' for specifications 1 through 5:
for (i in 1:5) {
	treated <- data.frame()
	control <- data.frame()
	name1 <- paste("treated_v", i, sep="")
	name2 <- paste("control_v", i, sep="")
	if (i==1) {
		treated <- subset(get(name1), select=c("ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809", "yearofincorporation", "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3", "inv_loc", "avg_growth")) # For completeness, the data set contains additional permutations of patent variable specification not used in our analysis.
		treated[treated==-3] <- NA # Missing values were previously coded as -3
		treated <- aggregate(treated, list(get(name1)$bvdidnumber), mean, na.rm=TRUE) # Whenever there are ties in the control set below, there will be repeated entries of the same treated firm in the treated set. Averging by ID number in the treated set is therefore equivalent to removing duplicate rows.
		control <- subset(get(name2), select=c("ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809", "yearofincorporation", "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3", "inv_loc", "avg_growth")) # Since firms are matched exactly for nace3, an average across control firms matched to the same treated firm will reproduce the same 3-digit NACE code.
		control[control==-3] <- NA
		control <- aggregate(control, list(get(name1)$bvdidnumber), mean, na.rm=TRUE) # Whenever multiple controls have matched to a single treated firm, we take the average values for the control set to be our best counterfactual estimate.
	}
	if (i>1) {
		treated <- subset(get(name1), select=c("ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809","yearofincorporation",  "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3")) # For completeness, the data set contains additional permutations of patent variable specification not used in our analysis.
		treated[treated==-3] <- NA # Missing values were previously coded as -3
		treated <- aggregate(treated, list(get(name1)$bvdidnumber), mean, na.rm=TRUE) # Whenever there are ties in the control set below, there will be repeated entries of the same treated firm in the treated set. Averging by ID number in the treated set is therefore equivalent to removing duplicate rows.
		control <- subset(get(name2), select=c("ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809", "yearofincorporation", "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3")) # For completeness, the data set contains additional permutations of patent variable specification not used in our analysis. Since firms are matched exactly for nace3, an average will  reproduce the same value.
		control[control==-3] <- NA
		control <- aggregate(control, list(get(name1)$bvdidnumber), mean, na.rm=TRUE) # Whenever multiple controls have matched to a single treated firm, we take the average values for the control set to be our best counterfactual estimate.
	}
	name3 <- paste("treated_w", i, sep="")
	assign(name3, treated)
	name4 <- paste("control_w", i, sep="")
	assign(name4, control)
	rm(treated, control)
}
rm(list=ls()[-c(grep("treated_", ls()), grep("control_", ls()))])
save.image("matched_data_compiled.RData")

## Retrieve matched observations for specification 6:
rm(list = ls())
data <- c("AT", "BE", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "LT", "LU", "NL", "NO", "PT", "RO", "SE", "US")
for (j in seq_along(data)) {
	filename <- paste("matched_data_copatent_", data[j], "_v6.RData", sep="")
	load(filename)
	rm("filename")
}
rm(j)
load(file="matching_data_copatent.RData")

treated <- data.frame()
control <- data.frame()
for (j in seq_along(data)) {
	country_d <- get(paste(data[j], "_d", sep=""))
	country_m <- get(paste(data[j], "_m1", sep=""))
	treated <- rbind(treated, country_d[floor(country_m$index.treated),])
	control <- rbind(control, country_d[floor(country_m$index.control),])
}
name1 <- c("treated_v6")
assign(name1, treated)
name2 <- c("control_v6")
assign(name2, control)
rm(list=ls()[-c(grep("treated_v", ls()), grep("control_v", ls()))])

## Average 'ties' for specification 6:
treated <- data.frame()
control <- data.frame()
name1 <- c("treated_v6")
name2 <- c("control_v6")
treated <- subset(get(name1), select=c("copatent", "copatent_green", "yearofincorporation", "ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809", "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3"))
treated[treated==-3] <- NA
treated <- aggregate(treated, list(get(name1)$bvdidnumber), mean, na.rm=TRUE)
control <- subset(get(name2), select=c("copatent", "copatent_green", "yearofincorporation", "ETS", "ETS_related", "epo_green_pat_epo7889", "epo_green_pat_epo9099", "epo_green_pat_epo0002", "epo_green_pat_epo0304", "epo_green_pat_epo0507", "epo_green_pat_epo0809", "epo_green_pat_epoipc7889", "epo_green_pat_epoipc9099", "epo_green_pat_epoipc0002", "epo_green_pat_epoipc0304", "epo_green_pat_epoipc0507", "epo_green_pat_epoipc0809", "epo_pct_pat7889", "epo_pct_pat9099", "epo_pct_pat0002", "epo_pct_pat0304", "epo_pct_pat0507", "epo_pct_pat0809", "epo_total_pat7889", "epo_total_pat9099", "epo_total_pat0002", "epo_total_pat0304", "epo_total_pat0507", "epo_total_pat0809", "epo_green_pat_epo_fam7889", "epo_green_pat_epo_fam9099", "epo_green_pat_epo_fam0002", "epo_green_pat_epo_fam0304", "epo_green_pat_epo_fam0507", "epo_green_pat_epo_fam0809", "epo_green_pat_epoipc_fam7889", "epo_green_pat_epoipc_fam9099", "epo_green_pat_epoipc_fam0002", "epo_green_pat_epoipc_fam0304", "epo_green_pat_epoipc_fam0507", "epo_green_pat_epoipc_fam0809", "epo_pct_pat_fam7889", "epo_pct_pat_fam9099", "epo_pct_pat_fam0002", "epo_pct_pat_fam0304", "epo_pct_pat_fam0507", "epo_pct_pat_fam0809", "epo_total_pat_fam7889", "epo_total_pat_fam9099", "epo_total_pat_fam0002", "epo_total_pat_fam0304", "epo_total_pat_fam0507", "epo_total_pat_fam0809", "epo_green_pat_epo_cit7889", "epo_green_pat_epo_cit9099", "epo_green_pat_epo_cit0002", "epo_green_pat_epo_cit0304", "epo_green_pat_epo_cit0507", "epo_green_pat_epo_cit0809", "epo_green_pat_epoipc_cit7889", "epo_green_pat_epoipc_cit9099", "epo_green_pat_epoipc_cit0002", "epo_green_pat_epoipc_cit0304", "epo_green_pat_epoipc_cit0507", "epo_green_pat_epoipc_cit0809", "epo_pct_pat_cit7889", "epo_pct_pat_cit9099", "epo_pct_pat_cit0002", "epo_pct_pat_cit0304", "epo_pct_pat_cit0507", "epo_pct_pat_cit0809", "epo_total_pat_cit7889", "epo_total_pat_cit9099", "epo_total_pat_cit0002", "epo_total_pat_cit0304", "epo_total_pat_cit0507", "epo_total_pat_cit0809", "revenue0102", "revenue0104", "revenue0105", "employees0104", "nace3"))
control[control==-3] <- NA
control <- aggregate(control, list(get(name1)$bvdidnumber), mean, na.rm=TRUE)
name3 <- c("treated_w6")
assign(name3, treated)
name4 <- c("control_w6")
assign(name4, control)
rm(list=ls()[-c(grep("treated_", ls()), grep("control_", ls()))])
save.image("matched_data_copatent_compiled.RData")

## Collect all matched data sets into one R workspace
rm(list=ls())
load("matched_data_compiled.RData")
load("matched_data_copatent_compiled.RData")
rm(list=ls()[-c(grep("treated_", ls()), grep("control_", ls()))])
save.image("matched_data.RData")


# Impose additional restrictions on match quality to arrive at final matched samples
## For specification 1:
### Recreate matching variables
treated_w1$patents <- 0
treated_w1$patents[treated_w1$epo_total_pat7889>0 | treated_w1$epo_total_pat9099>0 | treated_w1$epo_total_pat0002>0 | treated_w1$epo_total_pat0304>0] <- 1
treated_w1$green_pat <- 0
treated_w1$green_pat[treated_w1$epo_green_pat_epo7889>0 | treated_w1$epo_green_pat_epo9099>0 | treated_w1$epo_green_pat_epo0002>0 | treated_w1$epo_green_pat_epo0304>0] <- 1
treated_w1$patent_count <- treated_w1$epo_total_pat0002 + treated_w1$epo_total_pat0304
treated_w1$green_patent_count <- treated_w1$epo_green_pat_epo0002 + treated_w1$epo_green_pat_epo0304
control_w1$patents <- 0
control_w1$patents[control_w1$epo_total_pat7889>0 | control_w1$epo_total_pat9099>0 | control_w1$epo_total_pat0002>0 | control_w1$epo_total_pat0304>0] <- 1
control_w1$green_pat <- 0
control_w1$green_pat[control_w1$epo_green_pat_epo7889>0 | control_w1$epo_green_pat_epo9099>0 | control_w1$epo_green_pat_epo0002>0 | control_w1$epo_green_pat_epo0304>0] <- 1
control_w1$patent_count <- control_w1$epo_total_pat0002 + control_w1$epo_total_pat0304
control_w1$green_patent_count <- control_w1$epo_green_pat_epo0002 + control_w1$epo_green_pat_epo0304

### Impose additional restrictions on match quality
conditions <- (log(treated_w1$green_patent_count+1) - log(control_w1$green_patent_count+1)) < log(15) & (log(treated_w1$patent_count+1) - log(control_w1$patent_count+1)) < log(20)
ts_w1 <- subset(treated_w1, conditions)
cs_w1 <- subset(control_w1, conditions)
ts_1 <- ts_w1[ts_w1$revenue0104>0 & cs_w1$revenue0104>0,]
cs_w1 <- cs_w1[ts_w1$revenue0104>0 & cs_w1$revenue0104>0,]
ts_w1 <- ts_1
rm(ts_1)

### Retrieve firms in selected matched sample
control_v1_caliper <- control_v1
control_v1_caliper$match <- treated_v1$bvdidnumber
control_v1_caliper <- subset(control_v1_caliper, control_v1_caliper$match %in% ts_w1$Group.1)
control_v1_caliper <- aggregate(control_v1_caliper, list(control_v1_caliper$bvdidnumber), mean, na.rm=TRUE)

### Total number of EU ETS and non-EU ETS firms in our selected matched sample
nrow(ts_w1) # 3428 matched EU ETS firms, as reported in section 4.1.
nrow(control_v1_caliper) # 4373 matched non-EU ETS firms, as reported in section 4.1.

### Save list of matched firms, which is used to construct aggregate patent time series in "descriptive" analysed in 2-Descriptive_statistics.R, and retreive patents (in "ETSpatents") and patent counts (in "ETSpatent_counts") analysed in 4-Analysis_of_matched_data.R.
matched_sample <- as.data.frame(rbind(cbind(ts_w1$Group.1, rep.int(1,nrow(ts_w1))), cbind(control_v1_caliper$Group.1, rep.int(0,nrow(control_v1_caliper)))))
colnames(matched_sample) <- c("bvdidnumber","ETS")
write.csv(matched_sample, "matched_sample.csv")

### Add variable indicating whether or not the EU ETS firm in a matched group operated at least one installation that was opted-in to the EU ETS.
optins <- read.dta("CITL_BVD_table_multiple_matches_fixed_upadatedBVD.dta")
ts_w1$optin <- 1
ts_w1$optin[ts_w1$Group.1 %in% setdiff(ts_w1$Group.1, unique(optins$bvdid[optins$main_activity_type_code==99]))] <- 0
cs_w1$optin <- 1
cs_w1$optin[ts_w1$Group.1 %in% setdiff(ts_w1$Group.1, unique(optins$bvdid[optins$main_activity_type_code==99]))] <- 0
rm(optins)

## For specification 2:
treated_w2$patents <- 0
treated_w2$patents[treated_w2$epo_total_pat7889>0 | treated_w2$epo_total_pat9099>0 | treated_w2$epo_total_pat0002>0 | treated_w2$epo_total_pat0304>0] <- 1
treated_w2$green_pat <- 0
treated_w2$green_pat[treated_w2$epo_green_pat_epo7889>0 | treated_w2$epo_green_pat_epo9099>0 | treated_w2$epo_green_pat_epo0002>0 | treated_w2$epo_green_pat_epo0304>0] <- 1
treated_w2$patent_count <- treated_w2$epo_total_pat0002 + treated_w2$epo_total_pat0304
treated_w2$green_patent_count <- treated_w2$epo_green_pat_epo0002 + treated_w2$epo_green_pat_epo0304	
control_w2$patents <- 0
control_w2$patents[control_w2$epo_total_pat7889>0 | control_w2$epo_total_pat9099>0 | control_w2$epo_total_pat0002>0 | control_w2$epo_total_pat0304>0] <- 1
control_w2$green_pat <- 0
control_w2$green_pat[control_w2$epo_green_pat_epo7889>0 | control_w2$epo_green_pat_epo9099>0 | control_w2$epo_green_pat_epo0002>0 | control_w2$epo_green_pat_epo0304>0] <- 1
control_w2$patent_count <- control_w2$epo_total_pat0002 + control_w2$epo_total_pat0304
control_w2$green_patent_count <- control_w2$epo_green_pat_epo0002 + control_w2$epo_green_pat_epo0304	
conditions <- (log(treated_w2$patent_count+1) - log(control_w2$patent_count+1)) < log(10)	
ts_w2 <- subset(treated_w2, conditions)
cs_w2 <- subset(control_w2, conditions)
ts_2 <- ts_w2[ts_w2$revenue0105>0 & cs_w2$revenue0105>0,]
cs_w2 <- cs_w2[ts_w2$revenue0105>0 & cs_w2$revenue0105>0,]
ts_w2 <- ts_2
rm(ts_2)
nrow(ts_w2) # 3855 matched groups, as reported in section 4.3.

## For specification 3:
treated_w3$patents <- 0
treated_w3$patents[treated_w3$epo_total_pat7889>0 | treated_w3$epo_total_pat9099>0 | treated_w3$epo_total_pat0002>0] <- 1
treated_w3$green_pat <- 0
treated_w3$green_pat[treated_w3$epo_green_pat_epo7889>0 | treated_w3$epo_green_pat_epo9099>0 | treated_w3$epo_green_pat_epo0002>0] <- 1
treated_w3$patent_count <- treated_w3$epo_total_pat0002
treated_w3$green_patent_count <- treated_w3$epo_green_pat_epo0002	
control_w3$patents <- 0
control_w3$patents[control_w3$epo_total_pat7889>0 | control_w3$epo_total_pat9099>0 | control_w3$epo_total_pat0002>0] <- 1
control_w3$green_pat <- 0
control_w3$green_pat[control_w3$epo_green_pat_epo7889>0 | control_w3$epo_green_pat_epo9099>0 | control_w3$epo_green_pat_epo0002>0] <- 1
control_w3$patent_count <- control_w3$epo_total_pat0002
control_w3$green_patent_count <- control_w3$epo_green_pat_epo0002	
conditions <- (log(treated_w3$green_patent_count+1) - log(control_w3$green_patent_count+1)) < log(15) & (log(treated_w3$patent_count+1) - log(control_w3$patent_count+1)) < log(20)
ts_w3 <- subset(treated_w3, conditions)
cs_w3 <- subset(control_w3, conditions)
ts_3 <- ts_w3[ts_w3$revenue0104>0 & cs_w3$revenue0104>0,]
cs_w3 <- cs_w3[ts_w3$revenue0104>0 & cs_w3$revenue0104>0,]
ts_w3 <- ts_3
rm(ts_3)

## For specification 4:
treated_w4$patents <- 0
treated_w4$patents[treated_w4$epo_total_pat7889>0 | treated_w4$epo_total_pat9099>0 | treated_w4$epo_total_pat0002>0 | treated_w4$epo_total_pat0304>0] <- 1
treated_w4$green_pat <- 0
treated_w4$green_pat[treated_w4$epo_green_pat_epo7889>0 | treated_w4$epo_green_pat_epo9099>0 | treated_w4$epo_green_pat_epo0002>0 | treated_w4$epo_green_pat_epo0304>0] <- 1
treated_w4$patent_count <- treated_w4$epo_total_pat0002 + treated_w4$epo_total_pat0304
treated_w4$green_patent_count <- treated_w4$epo_green_pat_epo0002 + treated_w4$epo_green_pat_epo0304	
control_w4$patents <- 0
control_w4$patents[control_w4$epo_total_pat7889>0 | control_w4$epo_total_pat9099>0 | control_w4$epo_total_pat0002>0 | control_w4$epo_total_pat0304>0] <- 1
control_w4$green_pat <- 0
control_w4$green_pat[control_w4$epo_green_pat_epo7889>0 | control_w4$epo_green_pat_epo9099>0 | control_w4$epo_green_pat_epo0002>0 | control_w4$epo_green_pat_epo0304>0] <- 1
control_w4$patent_count <- control_w4$epo_total_pat0002 + control_w4$epo_total_pat0304
control_w4$green_patent_count <- control_w4$epo_green_pat_epo0002 + control_w4$epo_green_pat_epo0304	
conditions <- (log(treated_w4$green_patent_count+1) - log(control_w4$green_patent_count+1)) < log(15) & (log(treated_w4$patent_count+1) - log(control_w4$patent_count+1)) < log(20)
ts_w4 <- subset(treated_w4, conditions)
cs_w4 <- subset(control_w4, conditions)
ts_4 <- ts_w4[ts_w4$revenue0104>0 & cs_w4$revenue0104>0,]
cs_w4 <- cs_w4[ts_w4$revenue0104>0 & cs_w4$revenue0104>0,]
ts_w4 <- ts_4
rm(ts_4)

## For specification 5:
treated_w5$patents <- 0
treated_w5$patents[treated_w5$epo_total_pat7889>0 | treated_w5$epo_total_pat9099>0 | treated_w5$epo_total_pat0002>0 | treated_w5$epo_total_pat0304>0] <- 1
treated_w5$green_pat <- 0
treated_w5$green_pat[treated_w5$epo_green_pat_epo7889>0 | treated_w5$epo_green_pat_epo9099>0 | treated_w5$epo_green_pat_epo0002>0 | treated_w5$epo_green_pat_epo0304>0] <- 1
treated_w5$patent_count <- treated_w5$epo_total_pat0002 + treated_w5$epo_total_pat0304
treated_w5$green_patent_count <- treated_w5$epo_green_pat_epo0002 + treated_w5$epo_green_pat_epo0304	
control_w5$patents <- 0
control_w5$patents[control_w5$epo_total_pat7889>0 | control_w5$epo_total_pat9099>0 | control_w5$epo_total_pat0002>0 | control_w5$epo_total_pat0304>0] <- 1
control_w5$green_pat <- 0
control_w5$green_pat[control_w5$epo_green_pat_epo7889>0 | control_w5$epo_green_pat_epo9099>0 | control_w5$epo_green_pat_epo0002>0 | control_w5$epo_green_pat_epo0304>0] <- 1
control_w5$patent_count <- control_w5$epo_total_pat0002 + control_w5$epo_total_pat0304
control_w5$green_patent_count <- control_w5$epo_green_pat_epo0002 + control_w5$epo_green_pat_epo0304	
conditions <- (log(treated_w5$green_patent_count+1) - log(control_w5$green_patent_count+1)) < log(15) & (log(treated_w5$patent_count+1) - log(control_w5$patent_count+1)) < log(20)
ts_w5 <- subset(treated_w5, conditions)
cs_w5 <- subset(control_w5, conditions)
ts_5 <- ts_w5[ts_w5$revenue0104>0 & cs_w5$revenue0104>0,]
cs_w5 <- cs_w5[ts_w5$revenue0104>0 & cs_w5$revenue0104>0,]
ts_w5 <- ts_5
rm(ts_5)

## For specification 6:
treated_w6$patents <- 0
treated_w6$patents[treated_w6$epo_total_pat7889>0 | treated_w6$epo_total_pat9099>0 | treated_w6$epo_total_pat0002>0 | treated_w6$epo_total_pat0304>0] <- 1
treated_w6$green_pat <- 0
treated_w6$green_pat[treated_w6$epo_green_pat_epo7889>0 | treated_w6$epo_green_pat_epo9099>0 | treated_w6$epo_green_pat_epo0002>0 | treated_w6$epo_green_pat_epo0304>0] <- 1
treated_w6$patent_count <- treated_w6$epo_total_pat0002 + treated_w6$epo_total_pat0304
treated_w6$green_patent_count <- treated_w6$epo_green_pat_epo0002 + treated_w6$epo_green_pat_epo0304	
control_w6$patents <- 0
control_w6$patents[control_w6$epo_total_pat7889>0 | control_w6$epo_total_pat9099>0 | control_w6$epo_total_pat0002>0 | control_w6$epo_total_pat0304>0] <- 1
control_w6$green_pat <- 0
control_w6$green_pat[control_w6$epo_green_pat_epo7889>0 | control_w6$epo_green_pat_epo9099>0 | control_w6$epo_green_pat_epo0002>0 | control_w6$epo_green_pat_epo0304>0] <- 1
control_w6$patent_count <- control_w6$epo_total_pat0002 + control_w6$epo_total_pat0304
control_w6$green_patent_count <- control_w6$epo_green_pat_epo0002 + control_w6$epo_green_pat_epo0304	
conditions <- (log(treated_w6$green_patent_count+1) - log(control_w6$green_patent_count+1)) < log(4) & (log(treated_w6$patent_count+1) - log(control_w6$patent_count+1)) < log(4)
ts_w6 <- subset(treated_w6, conditions)
cs_w6 <- subset(control_w6, conditions)
ts_6 <- ts_w6[ts_w6$revenue0104>0 & cs_w6$revenue0104>0,]
cs_w6 <- cs_w6[ts_w6$revenue0104>0 & cs_w6$revenue0104>0,]
ts_w6 <- ts_6
rm(ts_6)
control_v6_caliper <- control_v6
control_v6_caliper$match <- treated_v6$bvdidnumber
control_v6_caliper <- subset(control_v6_caliper, control_v6_caliper$match %in% ts_w6$Group.1)
control_v6_caliper <- aggregate(control_v6_caliper, list(control_v6_caliper$bvdidnumber), mean, na.rm=TRUE)
nrow(ts_w6) # 2784 matched co-patenters, as reported in section 5.
nrow(control_v6_caliper) # 19361 matched non-co-patenters, as reported in section 5.


############### Create public version ###############
for (i in 1:6) {
	sets <- c("ts_w", "cs_w")
	for (j in seq_along(sets)) {
		if (i==1) { ds <- subset(get(paste(sets[j], i, sep="")), select=c(ETS, epo_green_pat_epo7889, epo_green_pat_epo9099, epo_green_pat_epo0002, epo_green_pat_epo0304, epo_green_pat_epo0507, epo_green_pat_epo0809, epo_green_pat_epoipc7889, epo_green_pat_epoipc9099, epo_green_pat_epoipc0002, epo_green_pat_epoipc0304, epo_green_pat_epoipc0507, epo_green_pat_epoipc0809, epo_pct_pat7889, epo_pct_pat9099, epo_pct_pat0002, epo_pct_pat0304, epo_pct_pat0507, epo_pct_pat0809, epo_total_pat7889, epo_total_pat9099, epo_total_pat0002, epo_total_pat0304, epo_total_pat0507, epo_total_pat0809, epo_green_pat_epo_fam7889, epo_green_pat_epo_fam9099, epo_green_pat_epo_fam0002, epo_green_pat_epo_fam0304, epo_green_pat_epo_fam0507, epo_green_pat_epo_fam0809, epo_green_pat_epoipc_fam7889, epo_green_pat_epoipc_fam9099, epo_green_pat_epoipc_fam0002, epo_green_pat_epoipc_fam0304, epo_green_pat_epoipc_fam0507, epo_green_pat_epoipc_fam0809, epo_pct_pat_fam7889, epo_pct_pat_fam9099, epo_pct_pat_fam0002, epo_pct_pat_fam0304, epo_pct_pat_fam0507, epo_pct_pat_fam0809, epo_total_pat_fam7889, epo_total_pat_fam9099, epo_total_pat_fam0002, epo_total_pat_fam0304, epo_total_pat_fam0507, epo_total_pat_fam0809, epo_green_pat_epo_cit7889, epo_green_pat_epo_cit9099, epo_green_pat_epo_cit0002, epo_green_pat_epo_cit0304, epo_green_pat_epo_cit0507, epo_green_pat_epo_cit0809, epo_green_pat_epoipc_cit7889, epo_green_pat_epoipc_cit9099, epo_green_pat_epoipc_cit0002, epo_green_pat_epoipc_cit0304, epo_green_pat_epoipc_cit0507, epo_green_pat_epoipc_cit0809, epo_pct_pat_cit7889, epo_pct_pat_cit9099, epo_pct_pat_cit0002, epo_pct_pat_cit0304, epo_pct_pat_cit0507, epo_pct_pat_cit0809, epo_total_pat_cit7889, epo_total_pat_cit9099, epo_total_pat_cit0002, epo_total_pat_cit0304, epo_total_pat_cit0507, epo_total_pat_cit0809, patents, green_pat, patent_count, green_patent_count, optin, inv_loc)) }
		if (i>1 & i<6) { ds <- subset(get(paste(sets[j], i, sep="")), select=c(ETS, epo_green_pat_epo7889, epo_green_pat_epo9099, epo_green_pat_epo0002, epo_green_pat_epo0304, epo_green_pat_epo0507, epo_green_pat_epo0809, epo_green_pat_epoipc7889, epo_green_pat_epoipc9099, epo_green_pat_epoipc0002, epo_green_pat_epoipc0304, epo_green_pat_epoipc0507, epo_green_pat_epoipc0809, epo_pct_pat7889, epo_pct_pat9099, epo_pct_pat0002, epo_pct_pat0304, epo_pct_pat0507, epo_pct_pat0809, epo_total_pat7889, epo_total_pat9099, epo_total_pat0002, epo_total_pat0304, epo_total_pat0507, epo_total_pat0809, epo_green_pat_epo_fam7889, epo_green_pat_epo_fam9099, epo_green_pat_epo_fam0002, epo_green_pat_epo_fam0304, epo_green_pat_epo_fam0507, epo_green_pat_epo_fam0809, epo_green_pat_epoipc_fam7889, epo_green_pat_epoipc_fam9099, epo_green_pat_epoipc_fam0002, epo_green_pat_epoipc_fam0304, epo_green_pat_epoipc_fam0507, epo_green_pat_epoipc_fam0809, epo_pct_pat_fam7889, epo_pct_pat_fam9099, epo_pct_pat_fam0002, epo_pct_pat_fam0304, epo_pct_pat_fam0507, epo_pct_pat_fam0809, epo_total_pat_fam7889, epo_total_pat_fam9099, epo_total_pat_fam0002, epo_total_pat_fam0304, epo_total_pat_fam0507, epo_total_pat_fam0809, epo_green_pat_epo_cit7889, epo_green_pat_epo_cit9099, epo_green_pat_epo_cit0002, epo_green_pat_epo_cit0304, epo_green_pat_epo_cit0507, epo_green_pat_epo_cit0809, epo_green_pat_epoipc_cit7889, epo_green_pat_epoipc_cit9099, epo_green_pat_epoipc_cit0002, epo_green_pat_epoipc_cit0304, epo_green_pat_epoipc_cit0507, epo_green_pat_epoipc_cit0809, epo_pct_pat_cit7889, epo_pct_pat_cit9099, epo_pct_pat_cit0002, epo_pct_pat_cit0304, epo_pct_pat_cit0507, epo_pct_pat_cit0809, epo_total_pat_cit7889, epo_total_pat_cit9099, epo_total_pat_cit0002, epo_total_pat_cit0304, epo_total_pat_cit0507, epo_total_pat_cit0809, patents, green_pat, patent_count, green_patent_count)) }
		if (i==6) { ds <- subset(get(paste(sets[j], i, sep="")), select=c(copatent_green, copatent, ETS, epo_green_pat_epo7889, epo_green_pat_epo9099, epo_green_pat_epo0002, epo_green_pat_epo0304, epo_green_pat_epo0507, epo_green_pat_epo0809, epo_green_pat_epoipc7889, epo_green_pat_epoipc9099, epo_green_pat_epoipc0002, epo_green_pat_epoipc0304, epo_green_pat_epoipc0507, epo_green_pat_epoipc0809, epo_pct_pat7889, epo_pct_pat9099, epo_pct_pat0002, epo_pct_pat0304, epo_pct_pat0507, epo_pct_pat0809, epo_total_pat7889, epo_total_pat9099, epo_total_pat0002, epo_total_pat0304, epo_total_pat0507, epo_total_pat0809, epo_green_pat_epo_fam7889, epo_green_pat_epo_fam9099, epo_green_pat_epo_fam0002, epo_green_pat_epo_fam0304, epo_green_pat_epo_fam0507, epo_green_pat_epo_fam0809, epo_green_pat_epoipc_fam7889, epo_green_pat_epoipc_fam9099, epo_green_pat_epoipc_fam0002, epo_green_pat_epoipc_fam0304, epo_green_pat_epoipc_fam0507, epo_green_pat_epoipc_fam0809, epo_pct_pat_fam7889, epo_pct_pat_fam9099, epo_pct_pat_fam0002, epo_pct_pat_fam0304, epo_pct_pat_fam0507, epo_pct_pat_fam0809, epo_total_pat_fam7889, epo_total_pat_fam9099, epo_total_pat_fam0002, epo_total_pat_fam0304, epo_total_pat_fam0507, epo_total_pat_fam0809, epo_green_pat_epo_cit7889, epo_green_pat_epo_cit9099, epo_green_pat_epo_cit0002, epo_green_pat_epo_cit0304, epo_green_pat_epo_cit0507, epo_green_pat_epo_cit0809, epo_green_pat_epoipc_cit7889, epo_green_pat_epoipc_cit9099, epo_green_pat_epoipc_cit0002, epo_green_pat_epoipc_cit0304, epo_green_pat_epoipc_cit0507, epo_green_pat_epoipc_cit0809, epo_pct_pat_cit7889, epo_pct_pat_cit9099, epo_pct_pat_cit0002, epo_pct_pat_cit0304, epo_pct_pat_cit0507, epo_pct_pat_cit0809, epo_total_pat_cit7889, epo_total_pat_cit9099, epo_total_pat_cit0002, epo_total_pat_cit0304, epo_total_pat_cit0507, epo_total_pat_cit0809, patents, green_pat, patent_count, green_patent_count)) }
		assign(paste(sets[j], i, sep=""), ds)
	}
}
rm(list=ls()[-c(grep("ts_w", ls()), grep("cs_w", ls()))])
save.image("EUETS_innovation_public.RData")

# Add patent counts for matched and unmatched EU ETS firms
ETSpatent_counts <- read.dta("matched_v_unmatched.dta")
ETSpatent_counts <- subset(ETSpatent_counts, select=c("green_pat0004", "green_pat0509", "total_pat0004", "total_pat0509", "matched")) # Omit all data obtained under license.
save.image("EUETS_innovation_public.Rdata")

# Add list of low-carbon patents filed by EU ETS firms
ETSpatents <- read.dta("ETS_companies_patents_oldstata.dta")
ETSpatents <- subset(ETSpatents, select=c("appln_year", "appln_id", "green_pat_epo", "Ycode", "technology")) # Omit all data obtained under license.
ETSpatents <- subset(ETSpatents, appln_year<2010 & appln_year>2004 & green_pat_epo==1)
save.image("EUETS_innovation_public.Rdata")

# Add aggregate patent time series
descriptive <- read.csv("data_for_graphs.csv")
descriptive <- subste(descriptive, select=c("year", "tot_epo_pat", "tot_epo_green_pat_Y02", "tot_epo_green_pat_Y02plus", "tot_epo_pct_pat", "oil_price_2010_usd", "total_patents_orbis", "total_patents_ets", "greenpat_epo_ets", "greenpat_epo_cit_ets", "greenpat_epo_fam_ets", "total_patents_nonets", "greenpat_epo_nonets", "greenpat_epo_cit_nonets", "greenpat_epo_fam_nonets"))
save.image("EUETS_innovation_public.Rdata")

# Add information about EU ETS installations
ETSinstallations <- read.dta("CITL_BVD_table_multiple_matches_fixed_upadatedBVD.dta")
ETSinstallations$firm_id <- as.numeric(factor(ETSinstallations$bvdid)) # BvD ID numbers were obtained under a license agreement and cannot be disclosed. The link to EU ETS installations was constructed by the researchers, so we can at least group installations by an otherwise uninformative identifier without disclosing any information obtained under the license agreement.
ETSinstallations <- subset(ETSinstallations, select=c(registry_code, installation_name, citl_instid, citl_permitid, main_activity_type_code, allocated_2005, verified_emissions_2005, surrendered_units_2005, allocated_2006, verified_emissions_2006, surrendered_units_2006, allocated_2007, verified_emissions_2007, surrendered_units_2007, firm_id, matched)) # Omit all data obtained under license.
save.image("EUETS_innovation_public.Rdata")